import pandas as pd
awards = pd.read_csv(".../awards_data.csv")
player_data = pd.read_csv(".../player_stats.csv")
team_data = pd.read_csv(".../team_stats.csv")
rebounding_data = pd.read_csv("...team_rebounding_data_22.csv")
awards
season | nbapersonid | All NBA Defensive First Team | All NBA Defensive Second Team | All NBA First Team | All NBA Second Team | All NBA Third Team | All Rookie First Team | All Rookie Second Team | Bill Russell NBA Finals MVP | ... | all_star_game | rookie_all_star_game | allstar_rk | Defensive Player Of The Year_rk | Most Improved Player_rk | Most Valuable Player_rk | Rookie Of The Year_rk | Sixth Man Of The Year_rk | all_nba_points_rk | all_rookie_points_rk | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2007 | 708.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | True | False | 1.0 | 1.0 | NaN | 3.0 | NaN | NaN | NaN | NaN |
1 | 2007 | 947.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | True | False | 2.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
2 | 2007 | 948.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | NaN | NaN | 3.0 | 2.0 | NaN | NaN | NaN | NaN | NaN | NaN |
3 | 2007 | 959.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | True | False | 4.0 | NaN | NaN | 9.0 | NaN | NaN | NaN | NaN |
4 | 2007 | 977.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | True | False | 1.0 | 5.0 | NaN | 1.0 | NaN | NaN | NaN | NaN |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
4324 | 2015 | 1626170.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 24.0 |
4325 | 2015 | 1626202.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 24.0 |
4326 | 2015 | 1626273.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 24.0 |
4327 | 2018 | 1628971.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 18.0 |
4328 | 2020 | 1630214.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 18.0 |
4329 rows × 23 columns
player_data
nbapersonid | player | draftyear | draftpick | season | nbateamid | team | games | games_start | mins | ... | blk_pct | tov_pct | usg | OWS | DWS | WS | OBPM | DBPM | BPM | VORP | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2585 | Zaza Pachulia | 2003 | 42.0 | 2007 | 1610612737 | ATL | 62 | 5 | 944 | ... | 0.010 | 0.181 | 0.183 | 0.2 | 0.9 | 1.1 | -3.9 | -1.3 | -5.1 | -0.7 |
1 | 200780 | Solomon Jones | 2006 | 33.0 | 2007 | 1610612737 | ATL | 35 | 0 | 145 | ... | 0.026 | 0.221 | 0.156 | -0.1 | 0.1 | 0.0 | -6.7 | -2.0 | -8.8 | -0.2 |
2 | 2746 | Josh Smith | 2004 | 17.0 | 2007 | 1610612737 | ATL | 81 | 81 | 2873 | ... | 0.059 | 0.155 | 0.250 | 1.2 | 4.6 | 5.8 | 0.5 | 2.5 | 3.0 | 3.7 |
3 | 201151 | Acie Law | 2007 | 11.0 | 2007 | 1610612737 | ATL | 56 | 6 | 865 | ... | 0.000 | 0.178 | 0.165 | -0.5 | 0.4 | -0.1 | -4.2 | -1.0 | -5.2 | -0.7 |
4 | 101136 | Salim Stoudamire | 2005 | 31.0 | 2007 | 1610612737 | ATL | 35 | 0 | 402 | ... | 0.009 | 0.094 | 0.252 | 0.1 | 0.1 | 0.3 | -1.0 | -2.5 | -3.5 | -0.1 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
8487 | 1630648 | Jordan Schakel | 2021 | NaN | 2021 | 1610612764 | WAS | 4 | 0 | 30 | ... | 0.000 | 0.078 | 0.191 | -0.2 | 0.0 | -0.1 | -8.6 | -4.4 | -13.0 | -0.1 |
8488 | 1630557 | Corey Kispert | 2021 | 15.0 | 2021 | 1610612764 | WAS | 77 | 36 | 1801 | ... | 0.010 | 0.085 | 0.146 | 1.6 | 0.7 | 2.3 | -0.8 | -1.5 | -2.3 | -0.1 |
8489 | 1628398 | Kyle Kuzma | 2017 | 27.0 | 2021 | 1610612764 | WAS | 66 | 66 | 2204 | ... | 0.022 | 0.141 | 0.242 | 0.0 | 2.0 | 2.0 | 0.2 | -0.4 | -0.2 | 1.0 |
8490 | 203526 | Raul Neto | 2013 | 47.0 | 2021 | 1610612764 | WAS | 70 | 19 | 1372 | ... | 0.002 | 0.139 | 0.184 | 0.7 | 0.8 | 1.5 | -2.5 | -0.5 | -3.0 | -0.4 |
8491 | 1628418 | Thomas Bryant | 2017 | 42.0 | 2021 | 1610612764 | WAS | 27 | 9 | 439 | ... | 0.041 | 0.103 | 0.187 | 0.7 | 0.4 | 1.1 | -0.4 | -0.7 | -1.0 | 0.1 |
8492 rows × 49 columns
team_data
nbateamid | team | season | games | off_rtg | def_rtg | net_rtg | W | L | |
---|---|---|---|---|---|---|---|---|---|
0 | 1610612737 | ATL | 2007 | 82 | 106.9 | 108.9 | -2.0 | 37 | 45 |
1 | 1610612751 | BKN | 2007 | 82 | 104.0 | 109.4 | -5.4 | 34 | 48 |
2 | 1610612738 | BOS | 2007 | 82 | 110.2 | 98.9 | 11.3 | 66 | 16 |
3 | 1610612766 | CHA | 2007 | 82 | 104.6 | 109.4 | -4.8 | 32 | 50 |
4 | 1610612741 | CHI | 2007 | 82 | 103.9 | 107.2 | -3.3 | 33 | 49 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
445 | 1610612758 | SAC | 2021 | 82 | 109.9 | 115.3 | -5.4 | 30 | 52 |
446 | 1610612759 | SAS | 2021 | 82 | 112.4 | 112.3 | 0.1 | 34 | 48 |
447 | 1610612761 | TOR | 2021 | 82 | 112.9 | 110.5 | 2.4 | 48 | 34 |
448 | 1610612762 | UTA | 2021 | 82 | 116.7 | 110.5 | 6.2 | 49 | 33 |
449 | 1610612764 | WAS | 2021 | 82 | 111.1 | 114.5 | -3.4 | 35 | 47 |
450 rows × 9 columns
rebounding_data
team | opp_team | gamedate | game_number | offensive_rebounds | off_rebound_chances | oreb_pct | |
---|---|---|---|---|---|---|---|
0 | BOS | PHI | 2022-10-18 | 1 | 10 | 39 | 0.256410 |
1 | PHI | BOS | 2022-10-18 | 1 | 8 | 42 | 0.190476 |
2 | GSW | LAL | 2022-10-18 | 1 | 16 | 57 | 0.280702 |
3 | LAL | GSW | 2022-10-18 | 1 | 14 | 57 | 0.245614 |
4 | ORL | DET | 2022-10-19 | 1 | 13 | 47 | 0.276596 |
... | ... | ... | ... | ... | ... | ... | ... |
2455 | LAC | PHX | 2023-04-09 | 82 | 18 | 56 | 0.321429 |
2456 | MEM | OKC | 2023-04-09 | 82 | 12 | 55 | 0.218182 |
2457 | POR | GSW | 2023-04-09 | 82 | 11 | 61 | 0.180328 |
2458 | SAC | DEN | 2023-04-09 | 82 | 12 | 50 | 0.240000 |
2459 | MIN | NOP | 2023-04-09 | 82 | 11 | 49 | 0.224490 |
2460 rows × 7 columns
What is the average number of points per game for players in the 2007-2021 seasons who won All NBA First, Second, and Third teams (not the All Defensive Teams), as well as for players who were in the All-Star Game (not the rookie all-star game)?
playerstats = awards.merge(player_data, on='nbapersonid', how='inner')
first_team = playerstats[(playerstats['season_x'] >= 2007) & (playerstats['season_x'] <= 2021) &
(playerstats['All NBA First Team'] == 1.0)]
first_team_ppg = first_team['points'] / first_team['games']
print("First Team points per game:",first_team_ppg.mean())
sec_team = playerstats[(playerstats['season_x'] >= 2007) & (playerstats['season_x'] <= 2021) &
(playerstats['All NBA Second Team'] == 1.0)]
sec_team_ppg = sec_team['points'] / sec_team['games']
print("Second Team points per game:",sec_team_ppg.mean())
third_team = playerstats[(playerstats['season_x'] >= 2007) & (playerstats['season_x'] <= 2021) &
(playerstats['All NBA Third Team'] == 1.0)]
third_team_ppg = third_team['points'] / third_team['games']
print("Third Team points per game:",third_team_ppg.mean())
allstar_team = playerstats[(playerstats['season_x'] >= 2007) & (playerstats['season_x'] <= 2021) &
(playerstats['all_star_game'])]
allstar_team_ppg = allstar_team['points'] / allstar_team['games']
print("All-Star Team points per game:",allstar_team_ppg.mean())
First Team points per game: 22.3096871795066 Second Team points per game: 19.602453597125052 Third Team points per game: 17.406737392538716 All-Star Team points per game: 18.73400715244671
What was the average number of years of experience in the league it takes for players to make their first All NBA Selection (1st, 2nd, or 3rd team)? Please limit your sample to players drafted in 2007 or later who did eventually go on to win at least one All NBA selection.
allnba_players = playerstats[(playerstats['draftyear'] >= 2007) &
((playerstats['All NBA First Team'] == 1.0) |
(playerstats['All NBA Second Team'] == 1.0) |
(playerstats['All NBA Third Team'] == 1.0))]
allnba_columns = ['All NBA First Team', 'All NBA Second Team', 'All NBA Third Team']
allnba_players = allnba_players[allnba_players[allnba_columns].sum(axis=1) > 0]
first_allnba_years = allnba_players.groupby('nbapersonid')['season_x'].min()
years_to_first_allnba = first_allnba_years - allnba_players.groupby('nbapersonid')['draftyear'].min() + 1
print("Average years to first All NBA selection:", years_to_first_allnba.mean())
Average years to first All NBA selection: 4.682926829268292
You're going to work to create a dataset with a "career outcome" for each player, representing the highest level of success that the player achieved for at least two seasons after his first four seasons in the league. On a single season level, the outcomes are:
playerinfo = player_data.merge(awards, on='nbapersonid', how='left')
players_2010_draft = playerinfo[playerinfo['draftyear'] == 2010]
seasongames = team_data.merge(players_2010_draft, on='nbateamid', how='right')
def calculate_career_outcome(player_df):
elite_count = 0
all_star_count = 0
starter_count = 0
rotation_count = 0
roster_count = 0
out_of_league_count = 0
for index, season in player_df.iterrows():
if index >= 4:
adjusted_minutes = season['mins']
adjusted_games_started = season['games']
if (season['All NBA First Team'] > 0) or (season['All NBA Second Team'] > 0) or (season['All NBA Third Team'] > 0):
elite_count += 1
elif season['all_star_game'] == True:
all_star_count += 1
elif (adjusted_games_started >= 41) or (adjusted_minutes >= 2000):
starter_count += 1
elif adjusted_minutes >= 1000 and adjusted_minutes < 2000:
rotation_count += 1
elif adjusted_minutes >= 1:
roster_count += 1
if elite_count >= 2:
return "Elite"
elif all_star_count >= 2:
return "All-Star"
elif starter_count >= 2:
return "Starter"
elif rotation_count >= 2:
return "Rotation"
elif roster_count >= 2:
return "Roster"
else:
return "Out of the League"
results_list = []
for player, data in players_2010_draft.groupby('player'):
career_outcome = calculate_career_outcome(data)
results_list.append({'Player': player, 'Career Outcome': career_outcome})
results = pd.DataFrame(results_list)
outcome_counts = results['Career Outcome'].value_counts()
print(outcome_counts)
Starter 31 Roster 20 Out of the League 18 Elite 3 All-Star 1 Name: Career Outcome, dtype: int64
Making a prediction on which players drafted in 2018 or later, will make the All-Star Game, using the data of players drafted on or before 2015 to train the model
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
training_data = playerstats[playerstats['draftyear'] <= 2015].copy()
training_data['points_per_game'] = training_data['points'] / training_data['games']
X = training_data[['points', 'ast', 'steals', 'blocks', 'tot_reb', 'PER', 'points_per_game']]
y = training_data['all_star_game'].copy() # 1 if a player becomes an All-Star, 0 otherwise
y.fillna(False, inplace=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = LogisticRegression(max_iter=1000, random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
print(f'Accuracy: {accuracy}')
print('Classification Report:')
print(report)
newplayers = playerstats[playerstats['draftyear'] >= 2018].copy()
newplayers['points_per_game'] = newplayers['points'] / newplayers['games']
new_players_data = newplayers[['points', 'ast', 'steals', 'blocks', 'tot_reb', 'PER', 'points_per_game']]
predictions = model.predict(new_players_data)
newplayers['predicted_all_star'] = predictions
all_star_players = newplayers[newplayers['predicted_all_star'] == True]
grouped_players = all_star_players.groupby('player')['predicted_all_star'].max()
print(grouped_players)
Accuracy: 0.8790214477211796 Classification Report: precision recall f1-score support False 0.89 0.98 0.93 5102 True 0.70 0.30 0.41 866 accuracy 0.88 5968 macro avg 0.79 0.64 0.67 5968 weighted avg 0.86 0.88 0.86 5968 player Ja Morant True Luka Doncic True Luka Dončić True Shai Gilgeous-Alexander True Trae Young True Zion Williamson True Name: predicted_all_star, dtype: bool
Calculate what OKC's predicted offensive rebound percent is for game 81 in the data. That is, use games 1-80 to predict game 81.
okc_data = rebounding_data[rebounding_data['team'] == 'OKC']
average_offensive_rebound_percent = okc_data.iloc[:80]['offensive_rebounds'].sum() / okc_data.iloc[:80]['off_rebound_chances'].sum()
predicted_offensive_rebound_percent = average_offensive_rebound_percent * 100
print("Predicted Offensive Rebound Percentage for Game 81:", predicted_offensive_rebound_percent)
Predicted Offensive Rebound Percentage for Game 81: 28.8689755388714
player_stats = pd.read_csv("C:/Users/shari/OneDrive/Desktop/Job Folder/OKC Project/Datasets/2021playerstats.csv", sep=';', encoding='latin1')
player_stats
Rk | Player | Pos | Age | Tm | G | GS | MP | FG | FGA | ... | FT% | ORB | DRB | TRB | AST | STL | BLK | TOV | PF | PTS | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | Precious Achiuwa | C | 22 | TOR | 73 | 28 | 23.6 | 3.6 | 8.3 | ... | 0.595 | 2.0 | 4.5 | 6.5 | 1.1 | 0.5 | 0.6 | 1.2 | 2.1 | 9.1 |
1 | 2 | Steven Adams | C | 28 | MEM | 76 | 75 | 26.3 | 2.8 | 5.1 | ... | 0.543 | 4.6 | 5.4 | 10.0 | 3.4 | 0.9 | 0.8 | 1.5 | 2.0 | 6.9 |
2 | 3 | Bam Adebayo | C | 24 | MIA | 56 | 56 | 32.6 | 7.3 | 13.0 | ... | 0.753 | 2.4 | 7.6 | 10.1 | 3.4 | 1.4 | 0.8 | 2.6 | 3.1 | 19.1 |
3 | 4 | Santi Aldama | PF | 21 | MEM | 32 | 0 | 11.3 | 1.7 | 4.1 | ... | 0.625 | 1.0 | 1.7 | 2.7 | 0.7 | 0.2 | 0.3 | 0.5 | 1.1 | 4.1 |
4 | 5 | LaMarcus Aldridge | C | 36 | BRK | 47 | 12 | 22.3 | 5.4 | 9.7 | ... | 0.873 | 1.6 | 3.9 | 5.5 | 0.9 | 0.3 | 1.0 | 0.9 | 1.7 | 12.9 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
807 | 601 | Thaddeus Young | PF | 33 | TOR | 26 | 0 | 18.3 | 2.6 | 5.5 | ... | 0.481 | 1.5 | 2.9 | 4.4 | 1.7 | 1.2 | 0.4 | 0.8 | 1.7 | 6.3 |
808 | 602 | Trae Young | PG | 23 | ATL | 76 | 76 | 34.9 | 9.4 | 20.3 | ... | 0.904 | 0.7 | 3.1 | 3.7 | 9.7 | 0.9 | 0.1 | 4.0 | 1.7 | 28.4 |
809 | 603 | Omer Yurtseven | C | 23 | MIA | 56 | 12 | 12.6 | 2.3 | 4.4 | ... | 0.623 | 1.5 | 3.7 | 5.3 | 0.9 | 0.3 | 0.4 | 0.7 | 1.5 | 5.3 |
810 | 604 | Cody Zeller | C | 29 | POR | 27 | 0 | 13.1 | 1.9 | 3.3 | ... | 0.776 | 1.9 | 2.8 | 4.6 | 0.8 | 0.3 | 0.2 | 0.7 | 2.1 | 5.2 |
811 | 605 | Ivica Zubac | C | 24 | LAC | 76 | 76 | 24.4 | 4.1 | 6.5 | ... | 0.727 | 2.9 | 5.6 | 8.5 | 1.6 | 0.5 | 1.0 | 1.5 | 2.7 | 10.3 |
812 rows × 30 columns
import matplotlib.pyplot as plt
position_counts = player_stats['Pos'].str[:2].value_counts()
plt.figure(figsize=(8, 8))
plt.pie(position_counts, labels=position_counts.index, autopct='%1.1f%%', startangle=140)
plt.title('Player Position Distribution')
plt.axis('equal')
plt.show()
player_stats_grouped = player_stats.groupby('Player')['PTS'].mean()
player_stats_sorted = player_stats_grouped.sort_values(ascending=False)
top_30_players = player_stats_sorted.head(30)
ppg = top_30_players
players = top_30_players.index
plt.figure(figsize=(10, 6))
plt.bar(players, ppg, color='teal', alpha=0.9)
plt.title('Top 30 Players by PPG')
plt.xlabel('Player')
plt.ylabel('PPG')
plt.xticks(rotation=90)
plt.tight_layout()
plt.show()
import seaborn as sns
player_stats['PTS'] = player_stats['PTS'].astype(float) # Convert PPG to float if it's not already
player_stats_sorted = player_stats.sort_values(by='PTS', ascending=False)
top_100_players = player_stats_sorted.head(150)
# Create a pivot table to summarize PPG by position
heatmap_data = top_100_players.pivot_table(values='PTS', index='Pos', aggfunc='mean')
# Create a heatmap using seaborn
plt.figure(figsize=(12, 7))
sns.heatmap(heatmap_data, annot=True, cmap='inferno', fmt='.1f', linewidths=0.2)
plt.title('Heatmap of PPG by Player Position')
plt.xlabel('PPG')
plt.ylabel('Player Position')
plt.show()